#Sidak Yntiso
#Extract WA data

###############################
###########Convert RDA#########
#1. Reading/cleaning scripts
#2. Prepare lines for reading
#3. Split lines and read in
#4.Further Fixes
###############################

#Start from a clean workspace: removes every (non-hidden) object from the
#global environment.
#NOTE(review): this also wipes anything the caller had in the session before
#sourcing this script.
rm(list=ls())
#pdftools supplies pdf_text(), used below to read the source PDF.
library(pdftools)
#dplyr makes the %>% pipe available.
library(dplyr)
#NOTE(review): rvest, RSelenium, readr, stringi and tidyr do not appear to be
#referenced anywhere in the visible script -- confirm they are still needed.
library(rvest)
library(RSelenium)
library(readr)
library(stringi)
library(tidyr)


#folder that holds the raw Washington sentencing files
dir <- "C:\\Users\\Sidak Yntiso\\Dropbox\\Sentencing Data\\Other States\\Washington\\Sentencing Data"
#restore the objects stored in the April 2020 raw extract
#(presumably this is where `reduce_dat`, used in section 2, comes from --
#it is not created anywhere in this script)
load(paste0(dir, "\\WA_file_raw_apr2020.RDA"))


#############################
#1. Reading/cleaning scripts#
#############################
#full path of the source PDF
WA_file <- paste0(dir, "\\P-8928_Yntiso_Final.pdf")

#read the PDF text and discard the first page
file1 <- pdf_text(WA_file)[-1]


#Split `x` successively on every separator in `splits`.
#
#x:      character vector to split.
#splits: separators, applied one after another to the pieces produced so far.
#...:    passed on to strsplit() (e.g. fixed = TRUE).
#Returns one flat character vector with the empty pieces removed.
strsplits <- function(x, splits, ...)
{
  pieces <- Reduce(
    function(so_far, separator) unlist(strsplit(so_far, separator, ...)),
    splits,
    x
  )
  pieces[pieces != ""]
}
##Lookup vectors used to locate fields inside each text line

##races
#upper-cased race labels. check_string() returns the first list entry that
#matches, so "OTHER ASIAN" has to stay ahead of the shorter "OTHER".
races <- toupper(c("Other Asian", "Black","Chinese","Cambodian","Filipino",
                   "North American Indian","Guamanian","Hispanic",
                   "Japanese","Korean","Laotian","Other" ,"Pacific Islander",
                   "Samoan" ,"Hawaiian","Vietnamese","White","Asian Indian","UNKNOWN"))

#genders
#" MALE" carries a leading space so that it cannot match inside "FEMALE"
genders <- c(" MALE","FEMALE")

#dates
#the dates in the PDF use a punctuation mark that is copied straight from the
#text: character 190 of line 12 of the first remaining page.
#NOTE(review): this fixed position (and the "\r\n" line ending) is fragile --
#confirm it still points at a date separator if the PDF changes.
page1 <- strsplit(file1[1], split = "\r\n") #first page
weird <- substr(page1[[1]][12],190,190) #random line
months <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
#date markers look like <sep>Mon<sep>
months <- paste0(weird, months, weird)

#counties
#Washington county names taken from the FIPS table of the maps package
data(county.fips,package="maps")
counties <- toupper(gsub("washington,","",
                         county.fips$polyname[county.fips$fips %in% c(53000:54000)]))
#collapse the multi-polygon counties into a single entry each
counties <- counties[!counties %in% c("SAN JUAN:LOPEZ ISLAND","SAN JUAN:ORCAS ISLAND"   )]
counties[counties=="SAN JUAN:SAN JUAN ISLAND"] <- "SAN JUAN"
counties <- counties[!counties %in% c("PIERCE:PENROSE" )]
counties[counties=="PIERCE:MAIN" ] <- "PIERCE"
#the text lines get the same placeholders in section 2, so the lookup has to
#use them too
counties[counties=="WALLA WALLA" ] <- "WXLLA WXLLA"
#fix: the lookup used to test for the misspelling "WAHKIAKUMN", so the
#placeholder was never applied and Wahkiakum could not be matched against the
#text (which contains "WXHKIAKUM"). The old spelling is still accepted.
counties[counties %in% c("WAHKIAKUM", "WAHKIAKUMN")] <- "WXHKIAKUM"

#Return the first entry of `test_list` that is found in `string`.
#
#Priority follows the ORDER OF `test_list`, not the position of the match
#inside `string`; callers that need a specific pattern to win must list it
#first. Entries are used as regular expressions (grep() default).
#
#string:    a single character string (NA never matches).
#test_list: character vector of candidate patterns, in priority order.
#Returns the matching entry, or the string "NA" when nothing matches.
check_string <- function(string,test_list){
  for (candidate in test_list) {
    #grep() yields 1 only when the single input string matches
    if (isTRUE(grep(candidate, string) == 1)) {
      return(candidate)
    }
  }
  "NA"
}
#Extract the date that surrounds the first matching month marker.
#
#string:    a single character string (NA never matches).
#test_list: month markers, tried in order and matched literally
#           (fixed = TRUE).
#Returns the 9 characters running from 2 before the start of the marker to
#6 after it (i.e. the marker plus 2 characters on each side when the marker
#is 5 characters long, as the entries of `months` are), or the string "NA"
#when no marker is found.
get_date <- function(string,test_list){
  marker <- "NA"
  for (candidate in test_list) {
    if (isTRUE(grep(candidate, string, fixed = TRUE) == 1)) {
      marker <- candidate
      break
    }
  }
  if (marker == "NA") {
    return("NA")
  }
  #locate the marker literally, exactly as it was detected above; a regex
  #search could land elsewhere when the separator is a metacharacter
  jackpot <- regexpr(marker, string, fixed = TRUE)[[1]]
  substr(string, jackpot - 2, jackpot + 6)
}


#Split `string` into the text before and the text after its date.
#
#string:    a single character string (NA never matches).
#test_list: month markers, tried in order and matched literally
#           (fixed = TRUE).
#Returns `string` unchanged when no marker is found; otherwise a list of two
#strings: the text that precedes the date and the text that follows it. The
#date is taken to occupy the same window get_date() extracts (2 characters
#before the start of the marker through 6 after it).
get_string_before_date <- function(string,test_list){
  marker <- "NA"
  for (candidate in test_list) {
    if (isTRUE(grep(candidate, string, fixed = TRUE) == 1)) {
      marker <- candidate
      break
    }
  }
  if (marker == "NA") {
    return(string)
  }
  #literal search, consistent with the detection above
  jackpot <- regexpr(marker, string, fixed = TRUE)[[1]]
  #the date starts at jackpot-2, so the left part stops at jackpot-3
  #(it used to stop at jackpot-2 and kept the first character of the date)
  case_left <- substr(string, 1, jackpot - 3)
  case_right <- substr(string, jackpot + 7, nchar(string))
  list(case_left, case_right)
}

##############################
#2. Prepare lines for reading#
##############################
#remove non-data rows: lines whose text contains "P-8928" or "DOC NUMBER".
#grepl() masks are used instead of negative grep() indices because
#x[-grep(...), ] drops EVERY row when the pattern is not found
#(-integer(0) selects nothing).
reduce_dat <- reduce_dat[!grepl("P-8928", reduce_dat$contents, perl = TRUE), ]
reduce_dat <- reduce_dat[!grepl("DOC NUMBER", reduce_dat$contents, perl = TRUE), ]

#convert contents into string object
reduce_dat$cleaned_contents <- as.character(reduce_dat$contents)

#two county names contain "WA" and conflict with the state-code handling in
#section 3; swap in the placeholders that the `counties` lookup also uses
reduce_dat$cleaned_contents <-
  gsub("WALLA WALLA", "WXLLA WXLLA", reduce_dat$cleaned_contents, fixed = TRUE)
reduce_dat$cleaned_contents <-
  gsub("WAHKIAKUM", "WXHKIAKUM", reduce_dat$cleaned_contents, fixed = TRUE)

#keep rows in WA: blank out every line without a standalone " WA " token
reduce_dat$cleaned_contents[!grepl(" WA ", reduce_dat$cleaned_contents)] <- NA

#separate columns by QUANTITY or LIFE: a "|" marks the boundary between the
#case details and the sentencing numbers
#900 rows have no charge, sentencing information. Identified by ending in 1888
ends_1888 <- endsWith(reduce_dat$cleaned_contents, " 1888") %in% TRUE
reduce_dat$cleaned_contents[ends_1888] <-
  gsub(" 1888", "|", reduce_dat$cleaned_contents[ends_1888])
reduce_dat$cleaned_contents <-
  gsub("QUANTITY", "|", reduce_dat$cleaned_contents)
reduce_dat$cleaned_contents <-
  gsub(" LIFE ", "|", reduce_dat$cleaned_contents)
#lines that still have no separator: also accept " LIFE" without the
#trailing space
no_sep <- !grepl("|", reduce_dat$cleaned_contents, fixed = TRUE)
reduce_dat$cleaned_contents[no_sep] <-
  gsub(" LIFE", "|", reduce_dat$cleaned_contents[no_sep])
reduce_dat$cleaned_contents <-
  gsub("FIREARM/UPFA CRIMINAL STREET GANG MEMBER OR ASLIFE"," FIREARM/UPFA CRIMINAL STREET GANG MEMBER OR A |",reduce_dat$cleaned_contents)

#Errors in splitting string (16k observations)
#NOTE(review): the original test here was grep("|", ...) WITHOUT fixed = TRUE.
#As a regular expression "|" matches every non-missing string, so the only
#rows that were ever re-processed are the ones blanked out by the " WA "
#filter above. They are rebuilt from the raw `contents` (QUANTITY split only,
#no county placeholders). That effective behaviour is kept unchanged here --
#confirm whether those rows are really meant to be restored.
blanked <- is.na(reduce_dat$cleaned_contents)
reduce_dat$cleaned_contents[blanked] <-
  gsub("QUANTITY", "|", reduce_dat$contents[blanked])

#the scratch `error` column is not carried forward
reduce_dat$error <- NULL
rm(ends_1888, no_sep, blanked)

############################
#3. Split lines and read in#
############################

#split every cleaned line at the "|" separators inserted in section 2
line1 <- strsplit(reduce_dat$cleaned_contents, split = "|", fixed = TRUE)

#helper: n-th piece of each element of a list of character vectors
#(NA when the element has fewer than n pieces). Safe on empty input, unlike
#looping over 1:length(x).
.nth_piece <- function(pieces, n) {
  vapply(pieces, function(p) as.character(p[n]), character(1),
         USE.NAMES = FALSE)
}

#contents to the left are casedetails
casedetails <- .nth_piece(line1, 1)
#contents to the right are sentencing
sentencing <- .nth_piece(line1, 2)


#items to the left are case details
#gender: first of " MALE" / "FEMALE" found in the case details ("NA" if none)
reduce_dat$gender <- trimws(
  vapply(casedetails, check_string, character(1),
         test_list = genders, USE.NAMES = FALSE))

#remove state
#fix: only the standalone " WA " token is blanked, and it is replaced by two
#spaces so the spacing of the line is exactly what it was before. The
#previous global gsub("WA", "") also deleted the letters "WA" inside names,
#judge names and charges (e.g. "WALKER" became "LKER").
casedetails <- gsub(" WA ", "  ", casedetails, fixed = TRUE)

#date
reduce_dat$date <- vapply(casedetails, get_date, character(1),
                          test_list = months, USE.NAMES = FALSE)

#split the case details at the gender token
casedetails <- gsub(" FEMALE "," MALE ",casedetails)
casedetails_split1 <- strsplit(casedetails, split = " MALE ", fixed = TRUE)

#objects to the left of the MALE/FEMALE are fixed character
demographics <- trimws(.nth_piece(casedetails_split1, 1))
#NOTE(review): character 7 ends up in both case_num and name -- confirm the
#width of the DOC number before changing either offset.
reduce_dat$case_num <- substr(demographics, 1, 7)
reduce_dat$name <- substr(demographics, 7, nchar(demographics))
rm(demographics)

#objects to the right hand side split by date
charge_data <- .nth_piece(casedetails_split1, 2)

#without date, there is no use for this data point
#split string by date. This will prevent errors from
#judges whose names are also counties
#get_string_before_date() returns the unchanged string when there is no date
#and a list(before, after) otherwise; p[k][[1]] handles both shapes
date_parts <- lapply(charge_data, get_string_before_date, test_list = months)
case2 <- vapply(date_parts, function(p) as.character(p[1][[1]]),
                character(1), USE.NAMES = FALSE)
case3 <- vapply(date_parts, function(p) as.character(p[2][[1]]),
                character(1), USE.NAMES = FALSE)
rm(date_parts)

#race
reduce_dat$race <- vapply(case2, check_string, character(1),
                          test_list = races, USE.NAMES = FALSE)

#county
county <- vapply(case2, check_string, character(1),
                 test_list = counties, USE.NAMES = FALSE)
#fix: translate the internal placeholders back so the stored county is the
#real county name
county <- gsub("WXLLA WXLLA", "WALLA WALLA", county, fixed = TRUE)
county <- gsub("WXHKIAKUM", "WAHKIAKUM", county, fixed = TRUE)
reduce_dat$county <- county
rm(county)

rm(case2)

#text after the date: fields are separated by runs of two or more spaces
case3 <- trimws(case3)
case3_split <- strsplit(case3, split = "  ", fixed = TRUE)
case3_split <- lapply(case3_split, function(p) trimws(p[!p == ""]))

#number of fields found after the date (kept for inspection; not used below)
lens <- as.numeric(lengths(case3_split))

#Easy case
#if there are three or more strings left, their order separates them
reduce_dat$judge <- .nth_piece(case3_split, 1)
#reduce_dat$judge[!lens==3]= NA

reduce_dat$sent_type <- .nth_piece(case3_split, 2)
#reduce_dat$sent_type[!lens==3]= NA
reduce_dat$sent_type[reduce_dat$sent_type %in% ""] <- " "

#remove judge and sent_type from case3; what is left is the charge.
#gsub() returns NA when the pattern is NA, so charge is NA whenever the judge
#or the sentence type is missing (section 4 relies on this to find rows with
#a missing charge).
charge <- vapply(seq_along(case3), function(i) {
  without_judge <- gsub(reduce_dat$judge[i], "", case3[i], fixed = TRUE)
  gsub(reduce_dat$sent_type[i], "", without_judge, fixed = TRUE)
}, character(1))
reduce_dat$charge <- charge
#reduce_dat$charge[!lens==3]= NA


#second field after the date (kept for inspection; not used below)
charge2 <- .nth_piece(case3_split, 2)
# reduce_dat$charge[!lens==3]= trimws(charge2[!lens==3])
# reduce_dat$charge = trimws(reduce_dat$charge)

#items to the right are sentencing details
sent <- strsplit(sentencing, split = " ")
sent <- lapply(sent, function(p) trimws(p[!p == ""]))

sent_lens <- as.numeric(lengths(sent))

#Easy case with four objects - if there are four items - these are confinement + supervision days
#Hard case with three objects it's the last three, with one object it's the first
#lines with any other number of items keep NA in all four columns

V1 <- .nth_piece(sent, 1)
V2 <- .nth_piece(sent, 2)
V3 <- .nth_piece(sent, 3)
V4 <- .nth_piece(sent, 4)

#the chained assignment creates the columns right-to-left; the order is kept
#so the column layout of the data set does not change
reduce_dat$TOTALCONFINEMENTDAYS <- reduce_dat$TOTALYEARSSUPERVISION <-
  reduce_dat$TOTALMONTHSSUPERVISION <- reduce_dat$TOTALDAYSSUPERVISION <- NA

#confinement is the first of four or the only
reduce_dat$TOTALCONFINEMENTDAYS[sent_lens==4] <- V1[sent_lens==4]
reduce_dat$TOTALCONFINEMENTDAYS[sent_lens==1] <- V1[sent_lens==1]

#total years supervision is the second if four or the first if three
reduce_dat$TOTALYEARSSUPERVISION[sent_lens==4] <- V2[sent_lens==4]
reduce_dat$TOTALYEARSSUPERVISION[sent_lens==3] <- V1[sent_lens==3]

#total months supervision is the third if four or the second if three
reduce_dat$TOTALMONTHSSUPERVISION[sent_lens==4] <- V3[sent_lens==4]
reduce_dat$TOTALMONTHSSUPERVISION[sent_lens==3] <- V2[sent_lens==3]

#total days supervision is the fourth if four or the third if three
reduce_dat$TOTALDAYSSUPERVISION[sent_lens==4] <- V4[sent_lens==4]
reduce_dat$TOTALDAYSSUPERVISION[sent_lens==3] <- V3[sent_lens==3]


###################################################################
#########################4.Further Fixes###########################
#4.1. Remove sent_type characters from charge and judge fields
#4.2. Remove 1888 lines: these appear to be errors in the main file
#4.3. Remove charge entries from the sent_type field
#4.4. Remove sentencing entries from the charge field
#4.5. Some charges are 1s
###################################################################
###################################################################

###############################################################
#4.1. Remove sent_type characters from charge and judge fields#
###############################################################
#sentence-type labels that leak into the judge and charge fields
gsubs <- c("DOSA PRISON (DOSA 3 & 4)","COMMUNITY - (FOP) OTHR JRSDCTN",
           "SPECIAL SEX OFFENDER(SSOSA)","COMMUNITY CUSTODY BOARD",
           "COMMUNITY","(FOS) OTHR JRSDCTN","CC MISDEMEANOR",
           "SPECIAL SEX OFFENDER(SSOSA)","FIRST TIME OFFENDER",
           "DOSA RESIDENTIAL (DOSA 3)","(FOP) OTHR JRSDCTN",
           "MISDEMEANOR","INSANITY ACQUITTAL"," FOSA", " LRA",
           "CONFINEMENT OVER ONE YEAR","CONFINEMENT UNDER ONE YEAR",
           "PROBATION","PRISON - ISRB", " ISRB"," DOSA (DOSA 2)",
           "LONG TERM JUVENILE BRD","MISDEMEANOR CC")

#fix: labels are removed longest first (duplicates dropped). In list order
#"MISDEMEANOR" was deleted before "MISDEMEANOR CC" and "CCMISDEMEANOR" were
#tried, and "COMMUNITY" before "COMMUNITY-(FOP)OTHRJRSDCTN", so the longer
#labels never matched and left fragments such as "CC" behind.
longest_first <- function(patterns) {
  patterns <- unique(patterns)
  patterns[order(-nchar(patterns))]
}

for (gs in longest_first(gsubs)){
  reduce_dat$judge <- gsub(gs, "", reduce_dat$judge, fixed = TRUE)
}

#fix charge
#the charge field can also contain the labels with their spaces squeezed out
gsubs <- c(gsubs,gsub(" ","",gsubs),"SUPERVISEDAPPEAL",
           "(FROMOUTOFSTATE) ")
for (gs in longest_first(gsubs)){
  reduce_dat$charge <- gsub(gs, "", reduce_dat$charge, fixed = TRUE)
}
reduce_dat$charge <- trimws(reduce_dat$charge)

###################################################################
#4.2. Remove 1888 lines: these appear to be errors in the main file
###################################################################
#error = 1 marks rows with a missing charge, except rows whose raw text
#contains " 1888" (those are always given error = 0)
charge_missing <- is.na(reduce_dat$charge)
mentions_1888 <- grepl(" 1888", reduce_dat$contents)
reduce_dat$error <- as.numeric(charge_missing & !mentions_1888)
#many errors are obvious
#keep every error-free row, plus the flagged rows for which a gender was found
#NOTE(review): " 1888" rows get error = 0 and are therefore KEPT, although the
#section title says they are removed -- confirm which is intended.
keep_row <- reduce_dat$error == 0 |
  (reduce_dat$error == 1 & reduce_dat$gender %in% c("FEMALE","MALE"))
final_dat <- reduce_dat[keep_row, , drop = FALSE]

######################################################
#4.3. Remove charge entries from the sent_type field
######################################################
#15749 lines have missing charge -> many times the charge is in sent_type.
#extract plausible charges from sent_type (these all begin with number)
#of 15749, 1041 already have sentencing information - appending these
#diagnostic: number of rows with an empty or missing charge
sum(final_dat$charge == "" | is.na(final_dat$charge))

#a sent_type that starts with digits is really a charge: prepend it to the
#charge text (no separator) and clear sent_type
starts_with_digit <- grepl("^[[:digit:]]+", final_dat$sent_type)
final_dat$charge[starts_with_digit] <-
  paste0(final_dat$sent_type[starts_with_digit],
         final_dat$charge[starts_with_digit])
final_dat$sent_type[starts_with_digit] <- NA

#one combined label is split into its charge and its sentence type
juvenile_murder <- which(final_dat$sent_type == "AGGRAVATED MURDER JUVENILE BRD")
final_dat$charge[juvenile_murder] <- "AGGRAVATED MURDER"
final_dat$sent_type[juvenile_murder] <- "JUVENILE BRD"

#1070 still have missing charge.
sum(final_dat$charge == "" | is.na(final_dat$charge))

#Loop over observed charges, check if match string
#candidate charges: every observed charge that starts with a number ...
availcharges <- unique(final_dat$charge[grepl("^[[:digit:]]+", final_dat$charge)])
#possible lines: rows whose charge is still empty or missing
final_dat$error <- as.numeric(is.na(final_dat$charge) | final_dat$charge == "")
#dataset to globally change
x_dat <- final_dat[final_dat$error == 1, , drop = FALSE]
#... plus three charges added by hand
availcharges <- c(availcharges,"9 COMMITTING CRIME WHEN ARMED QUANTITY",
                  "SEX OFFENSE (FROM OUT OF STATE)","43 BRIBERY")

#wherever a known charge occurs literally in the cleaned line, use it as the
#charge; when several charges occur in one line the last one tried wins
for (i in seq_along(availcharges)) {
  ch <- availcharges[i]
  found <- grepl(ch, x_dat$cleaned_contents, fixed = TRUE)
  x_dat$charge[found] <- ch
  #progress message on charge 1, 101, 201, ...
  if (i %% 100 == 1) {
    print(i)
  }
}
#put the repaired rows back underneath the untouched ones
final_dat <- plyr::rbind.fill(final_dat[final_dat$error == 0, , drop = FALSE],
                              x_dat)

sum(final_dat$charge == "" | is.na(final_dat$charge))

#####################################################
#4.4. Remove sentencing entries from the charge field
#####################################################

#3392 lines have sentencing info that bleeds into charge
#All_NA is TRUE for rows in which every TOTAL* column is still missing
final_dat$All_NA <- 
  apply(final_dat[, grep("TOTAL", names(final_dat))], 1, 
        function(x) all(is.na(x)))

##4.4.1. 2175 lines contain the string LIFE##
##1295 contain LIFE in charge
final_dat$error <- 0
final_dat$error[grep("LIFE", final_dat$charge)] <- 1
life_rows <- which(final_dat$error == 1)

#fix: the charge text is split ONCE, before it is overwritten. The previous
#version first truncated `charge` at "LIFE" and then looked for the text
#after "LIFE" in that already-truncated value, so nothing was ever extracted
#and the existing TOTAL* values of these rows were replaced by NA.
life_split <- strsplit(final_dat$charge[life_rows], split = "LIFE")

#if LIFE is in charges, the charge is everything before LIFE
xcharges <- final_dat$charge
xcharges[life_rows] <- trimws(
  vapply(life_split, function(p) as.character(p[1]), character(1)))
#... and the sentencing information is whatever follows it
after_life <- trimws(
  vapply(life_split, function(p) as.character(p[2]), character(1)))
final_dat$charge[life_rows] <- xcharges[life_rows]

#split xsent by space
xsent <- strsplit(after_life, split = " ")
xsent <- lapply(xsent, function(p) p[!p == ""])
xsent_lens <- as.numeric(lengths(xsent))

#only purely numeric tokens are treated as sentencing values, so that words
#following "LIFE" in a charge are never written into the TOTAL* columns
numeric_only <- vapply(xsent, function(p) {
  length(p) > 0 && !anyNA(p) && all(grepl("^[0-9]+([.][0-9]+)?$", p))
}, logical(1))

#Easy case with four objects - if there are four items - these are confinement + supervision days
#Hard case with three objects it's the last three, with one object it's the first
#any other number of items yields NA in all four fields

V1 <- vapply(xsent, `[`, character(1), 1)
V2 <- vapply(xsent, `[`, character(1), 2)
V3 <- vapply(xsent, `[`, character(1), 3)
V4 <- vapply(xsent, `[`, character(1), 4)

four <- xsent_lens == 4
three <- xsent_lens == 3
one <- xsent_lens == 1

TOTALCONFINEMENTDAYS <- TOTALYEARSSUPERVISION <- TOTALMONTHSSUPERVISION <-
  TOTALDAYSSUPERVISION <- rep(NA_character_, length(xsent))

#confinement is the first of four or the only
TOTALCONFINEMENTDAYS[four | one] <- V1[four | one]

#total years supervision is the second if four or the first if three
TOTALYEARSSUPERVISION[four] <- V2[four]
TOTALYEARSSUPERVISION[three] <- V1[three]

#total months supervision is the third if four or the second if three
TOTALMONTHSSUPERVISION[four] <- V3[four]
TOTALMONTHSSUPERVISION[three] <- V2[three]

#total days supervision is the fourth if four or the third if three
TOTALDAYSSUPERVISION[four] <- V4[four]
TOTALDAYSSUPERVISION[three] <- V3[three]

#write the values back only for rows that had no sentencing values at all
#(All_NA) and whose trailing text was numeric; rows that already carry
#sentencing values are left untouched instead of being overwritten
fill <- numeric_only & final_dat$All_NA[life_rows]
target <- life_rows[fill]
final_dat$TOTALCONFINEMENTDAYS[target] <- TOTALCONFINEMENTDAYS[fill]
final_dat$TOTALYEARSSUPERVISION[target] <- TOTALYEARSSUPERVISION[fill]
final_dat$TOTALMONTHSSUPERVISION[target] <- TOTALMONTHSSUPERVISION[fill]
final_dat$TOTALDAYSSUPERVISION[target] <- TOTALDAYSSUPERVISION[fill]

rm(life_rows, life_split, after_life, numeric_only,
   four, three, one, fill, target)


##4.4.2. 896 contain LIFE in cleaned contents##
#refresh the flag for rows in which every TOTAL* column is missing
total_cols <- grep("TOTAL", names(final_dat))
final_dat$All_NA <-
  rowSums(is.na(final_dat[, total_cols])) == length(total_cols)
#note: the LIFE test is run on the raw `contents` column
has_life <- grepl("LIFE", final_dat$contents)
#error = 1: the line mentions LIFE and has no sentencing values
final_dat$error <- as.numeric(has_life & final_dat$All_NA)
#these appear to be truly missing sentencing information.

##4.4.3. 1217 do NOT contain LIFE in cleaned contents##
#error = 1: no sentencing values and no LIFE in the line
#(this replaces the flag computed in 4.4.2 and is the value that is saved)
final_dat$error <- as.numeric(final_dat$All_NA & !has_life)

##1217 do not contain sentencing information 


##########################
#4.5. Some charges are 1s#
##########################
#a charge that consists only of "1" or "1888" carries no information
final_dat$charge <- trimws(final_dat$charge)
final_dat$charge[final_dat$charge %in% c("1","1888")] <- NA

#write the result as an R data file and as a Stata file
save(final_dat, file = paste0(dir, "\\WA_intermediate.RDA"))
haven::write_dta(final_dat, paste0(dir, "\\WA_file_final_nov20.dta"))

#fix: the stray save(V1) call was dropped -- save() needs a `file` argument
#and stopped the script with an error before the clean-up below could run.

#remove the scratch objects; only names that actually exist are passed to
#rm(), so objects that were never created (or already removed) no longer
#raise warnings
scratch <- c("case3","case3_split","case32","case3_split2","case2",
             "casedetails","county.fips","line1","page1","charge",
             "charge_data","charge2","counties","file1","judge2","x",
             "casedetails_split1")
rm(list = intersect(scratch, ls()))
rm(scratch)
